# Preamble -------------

setwd("C:/Users/sheinig/switchdrive/INCA/INCA_R/")
library("install.load")
install_load(
  "tidyverse",
  "readxl",
  "viridis",
  "readr",
  "jsonlite",
  "purrr",
  "rlang",
  "ggraph",
  "igraph",
  "stringr",
  "scales",
  "corrplot",
  "patchwork"
)

theme_settings <- theme_classic(base_size = 12) +
  theme(
    panel.grid.minor = element_line(color = "gray70", linewidth = 0.2),
    panel.grid.major = element_line(color = "gray70", linewidth = 0.2),
    legend.position = "bottom",
    legend.margin = margin(-5, 0, 0, 0)
  )


get_viridis_color <- function(n) {
  viridis(
    option = "viridis",
    direction = 1,
    begin = 0.3,
    end = 0.7,
    n = n
  )
}

# Dictionary -------------
dict <- list(
  data_path = "../inca_2025_12_15/",

  file_recipients = "customers.xlsx",
  file_caregivers = "care_givers.xlsx",
  file_relationship = "customer_caregivers.xlsx",
  file_care_prescription = "customer_prescriptions.xlsx",
  file_tasks = "customer_documentations.xlsx",
  file_inter_rai_first_app = "customer_care_situation_answers.xlsx",
  file_inter_rai = "customer_inter_rai_home_care_answers.xlsx",
  file_inter_rai_codebook = "inter_rai_home_care_questions.xlsx",
  file_promis = "customer_promis_answers.xlsx",
  file_text_extractions = "customer_document_text_extractions.xlsx",
  file_diagnoses = "customer_diagnoses.xlsx",
  file_medications = "customer_medications.xlsx",
  file_appointments = "appointments.xlsx",
  
  sheet = "Sheet1",
  NA_char = "NA",

  # column renames
  caregivers_lookup = c(
    "id_caregiver" = "id"
  ),

  recipients_lookup = c(
    "id_recipient" = "id"
  ),

  relationship_lookup = c(
    "id_relationship" = "id",
    "id_recipient" = "customer_id",
    "id_caregiver" = "care_giver_id"
  ),

  care_prescription_lookup = c(
    "id_care_prescription" = "id",
    "id_recipient" = "customer_id",
    "city" = "customer_city",
    "zip" = "customer_zip",
    "disability_compensation" = "customer_disability_compensation",
    "intensive_care_surcharge" = "customer_intensive_care_surcharge",
    "is_other_spitex_involved" = "customer_is_other_spitex_involved",
    "customer_other_spitex_name" = "customer_other_spitex_name"
  ),

  tasks_lookup = c(
    "id_task" = "id",
    "id_recipient" = "customer_id",
    "id_care_prescription" = "customer_prescription_id",
    "id_care_prescription_task" = "customer_prescription_task_id"
  ),

  interRai_lookup = c(
    "id_recipient" = "customer_id",
    "id_appointment" = "appointment_id",
    "id_interRai" = "inter_rai_home_care_question_id"
  ),

  promis_lookup = c(
    "id_recipient" = "customer_id",
    "id_promis" = "promis_question_id"
  ),
  
  file_lookup = c(
    "id_recipient" = "customer_id",
    "id_file" = "customer_file_id"
  ),
  
  appointments_lookup = c(
    "id_appointment" = "id",
    "id_consultant" = "consultant_id",
    "id_recipient" = "customer_id"
  ),

  # Plot formatting
  plot_device = "png",
  plot_dpi = 300,
  desc_width = 12,
  desc_height = 6,
  desc_units = "cm",

  # codiag formatting
  colour_start = 0.1,
  colour_end = 0.9,
  cd_width = 33,
  cd_height = 20,

  # factor level ordering
  factor_levels = c(
    "Yes",
    "No",
    "VeryGood",
    "Good",
    "Normal",
    "Bad",
    "VeryBad",
    "fourHours",
    "sixHours",
    "eightHours",
    "Light",
    "Medium",
    "Heavy"
  )
)

# Plotting functions ------------

create_summary <- function(input_data, group_column = NA) {
  if (is.na(group_column)) {
    input_data <- input_data %>% mutate(group_column = 1)
    group_column = "group_column"
  }
  input_data %>%
    select(-starts_with("id_")) %>%
    mutate(across(everything(), as.character)) %>%
    pivot_longer(
      cols = -group_column,
      names_to = "variable",
      values_to = "value"
    ) %>%
    replace_na(list(value = "Unknown")) %>%
    count(
      across(all_of(c(group_column, "variable", "value"))),
      name = "count"
    ) %>%
    group_by(across(all_of(c(group_column, "variable")))) %>%
    mutate(
      percentage = count / sum(count),
      value = case_when(
        value == TRUE ~ "Yes",
        value == FALSE ~"No",
        .default = value
      )
    ) %>%
    ungroup()
}

desc_plot_bar <- function(
  plot_data,
  plot_variable,
  variable_label,
  group_column = "group_column",
  plot_width = NA,
  plot_height = NA
) {
  plot_df = get(paste0(plot_data, "_summary_table"), envir = .GlobalEnv) %>%
    filter(variable == plot_variable) %>%
    mutate(
      value = factor(
        value,
        levels = c(
          dict$factor_levels,
          sort(setdiff(unique(value), c(dict$factor_levels, "Unknown"))),
          "Unknown"
        ),
        ordered = TRUE
      )
    )

  all_values <- levels(droplevels(plot_df$value))
  n_groups <- length(unique(plot_df[[group_column]]))

  plot_df <- plot_df %>%
    complete(
      !!sym(group_column),
      variable,
      value = all_values,
      fill = list(count = 0, percentage = 0)
    )
  plot_df[, group_column] = factor(
    plot_df[[group_column]],
    levels = sort(unique(plot_df[[group_column]]))
  )

  p <- ggplot(
    data = plot_df,
    aes(x = fct_rev(value), y = percentage)
  ) +
    theme_settings +
    geom_col(
      aes(fill = .data[[group_column]]),
      position = position_dodge()
    ) +
    scale_fill_manual(
      values = get_viridis_color(n_groups),
      name = NULL
    ) +
    scale_y_continuous(
      labels = scales::percent_format(),
      limits = c(0, 1),
      expand = expansion(mult = c(0, 0.05))
    ) +
    geom_text(
      aes(
        label = scales::percent(percentage, accuracy = 1),
        group = .data[[group_column]]
      ),
      position = position_dodge(width = 1),
      hjust = -0.15,
      size = 10 / .pt
    ) +
    labs(
      x = variable_label,
      y = NULL
    ) +
    coord_flip() +
    theme(
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      axis.title.y = element_text(margin = margin(r = 8)),
      legend.position = if (group_column != "group_column") "bottom" else "none"
    )

  ggsave(
    plot = p,
    filename = paste0(
      "../Plots/Descriptives/",
      plot_data,
      "_bar_",
      plot_variable,
      ".",
      dict$plot_device
    ),
    device = dict$plot_device,
    dpi = dict$plot_dpi,
    width = if (is.na(plot_width)) dict$desc_width else plot_width,
    height = if (is.na(plot_height)) dict$desc_height else plot_height,
    units = dict$desc_units
  )
}

desc_plot_col <- function(
  plot_data,
  plot_variable,
  variable_label,
  group_column = "group_column",
  ticks = NA
) {
  plot_df = get(paste0(plot_data, "_summary_table"), envir = .GlobalEnv) %>%
    filter(variable == plot_variable)

  if (grepl("date", plot_variable)) {
    plot_df <- plot_df %>%
      mutate(value = as.Date(value))
  } else {
    plot_df <- plot_df %>%
      mutate(value = parse_number(value, na = c("null", "NA", "", "Unknown")))
  }

  plot_df <- plot_df %>%
    filter(!is.na(value)) %>%
    group_by(.data[[group_column]]) %>%
    mutate(
      percentage = percentage / sum(percentage)
    )

  plot_df[, group_column] = factor(
    plot_df[[group_column]],
    levels = sort(unique(plot_df[[group_column]]))
  )
  n_groups <- length(unique(plot_df[[group_column]]))

  p <- ggplot(
    data = plot_df %>%
      na.omit(),
    aes(x = value, y = percentage)
  ) +
    theme_settings +
    scale_y_continuous(
      expand = expansion(mult = c(0, 0.2)),
      labels = scales::percent_format()
    ) +
    geom_col(
      aes(fill = .data[[group_column]]),
      position = "identity", # bars plotted on top of each other
      alpha = if (group_column != "group_column") 0.8 else 1
    ) +
    scale_fill_manual(
      values = get_viridis_color(n_groups),
      name = NULL
    ) +
    labs(
      x = variable_label,
      y = NULL
    ) +
    theme(
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      axis.title.x = element_text(margin = margin(t = 6)),
      legend.position = if (group_column != "group_column") "bottom" else "none"
    )
  if (!is.na(ticks[[1]])) {
    p <- p +
      scale_x_continuous(
        breaks = ticks
      )
  }

  ggsave(
    plot = p,
    filename = paste0(
      "../Plots/Descriptives/",
      plot_data,
      "_col_",
      plot_variable,
      ".",
      dict$plot_device
    ),
    device = dict$plot_device,
    dpi = dict$plot_dpi,
    width = dict$desc_width,
    height = dict$desc_height,
    units = dict$desc_units
  )
}

draw_plot <- function(
  set_level,
  dict,
  vertices,
  connect,
  hierarchy
) {
  graph_vertices <- vertices %>% filter(level %in% c(0, set_level))
  mygraph <- graph_from_data_frame(
    hierarchy %>% filter(level == set_level),
    vertices = graph_vertices
  )
  layout <- create_layout(
    mygraph,
    layout = 'linear',
    circular = TRUE,
    weight = share
  )

  vert_margin = if (set_level == 2) 0.5 else 0.2
  hor_margin = if (set_level == 2) 3.5 else 3

  connect_from = connect %>% filter(level == set_level) %>% pull(from)
  connect_to = connect %>% filter(level == set_level) %>% pull(to)
  connect_share = connect %>% filter(level == set_level) %>% pull(share)
  connect_factor = connect %>% filter(level == set_level) %>% pull(factor)

  if (set_level == 2) {
    vertices_outer = vertices %>% filter(level %in% c(0, 1))
    mygraph_outer <- graph_from_data_frame(
      hierarchy %>% filter(level == 1),
      vertices = vertices_outer
    )
    layout_outer <- create_layout(
      mygraph_outer,
      layout = 'linear',
      circular = TRUE,
      weight = share_2
    )
  }

  p <- ggraph(layout) +
    geom_conn_bundle(
      data = get_con(
        from = match(connect_from, graph_vertices$name),
        to = match(connect_to, graph_vertices$name)
      ),
      aes(
        edge_width = rep(connect_share, each = 3),
        edge_colour = if (set_level == 2) {
          rep(sign(connect_factor) * abs(connect_factor)^5, each = 3)
        } else {
          rep(sign(connect_factor) * abs(connect_factor)^4, each = 3)
        },
      ),
      alpha = 0.7,
      tension = 0.3,
    ) +
    scale_edge_colour_gradient2(
      low = turbo(2, begin = 0.05, end = 0.95)[2],
      mid = "white",
      high = turbo(2, begin = 0.05, end = 0.95)[1],
      midpoint = 0,
      name = NULL,
      guide = guide_edge_colourbar(
        title = NULL,
        barwidth = 20,
        barheight = 0.5,
        ticks = FALSE,
        label.position = "bottom",
        label.theme = element_text(size = 10)
      ),
      breaks = c(-1, 0, 1),
      labels = c(
        "rarely co-diagnosed",
        "as expected",
        "excessively co-diagnosed"
      )
    ) +
    scale_edge_width(trans = "exp") +
    geom_node_arc_bar(
      data = layout,
      aes(
        filter = (level == set_level),
        r0 = ifelse(set_level == 2, 1.21, 1.05),
        r = ifelse(set_level == 2, 1.28, 1.12),
        fill = vert_colour
      ),
      colour = "white"
    ) +
    geom_node_text(
      data = if (set_level == 2) layout_outer else layout,
      aes(
        filter = if (set_level == 2) (share_2 > 0.004) else (share > 0.004),
        label = vert_label,
        x = if (set_level == 2) x * 1.4 else x * 1.17,
        y = if (set_level == 2) y * 1.44 else y * 1.2,
        hjust = ifelse(
          x >= 0,
          0,
          1
        )
      ),
      lineheight = 0.8,
      size = 3.5,
    )

  if (set_level == 2) {
    p <- p +
      geom_node_text(
        data = layout,
        aes(
          label = name,
          filter = (share > 0.005),
          x = x * 1.05,
          y = y * 1.05,
          angle = ifelse(
            x >= 0,
            asin(y) * 360 / 2 / pi,
            360 - asin(y) * 360 / 2 / pi
          ),
          hjust = ifelse(
            x >= 0,
            0,
            1
          )
        ),
        size = 2.5,
      ) +
      geom_node_arc_bar(
        data = layout_outer,
        aes(
          filter = (level == 1),
          r0 = 1.29,
          r = 1.35,
          fill = vert_colour
        ),
        colour = "white"
      )
  }

  p <- p +
    scale_fill_identity() +
    guides(edge_width = "none") +
    theme_void() +
    coord_equal(clip = "off") +
    theme(
      legend.position = "bottom",
      legend.margin = margin(t = -10, b = 10),
      legend.box = "horizontal",
      plot.margin = unit(
        c(vert_margin, hor_margin, vert_margin, hor_margin),
        "cm"
      )
    )

  ggsave(
    plot = p,
    filename = paste0(
      "../Plots/Descriptives/Codiag_level_",
      set_level,
      ".",
      dict$plot_device
    ),
    device = dict$plot_device,
    dpi = dict$plot_dpi,
    width = dict$cd_width,
    height = dict$cd_height,
    units = dict$desc_units
  )

  p
}

propr.maut.function.201709 <- function(thetas){ 
  
  # DESCRIPTION This is the multi-attribute utility function, using isotonic
  # regression with linear interpolation for the single-attribute (dis)utility
  # functions. This code was written by Barry Dewitt in September 2017 using R
  # 3.4.0.
  
  # INPUTS
  # The input thetas must be a 7 element vector with the following components
  # (in order):
  #   - theta_cog is a score on the Cognitive Functioning - Abilities domain
  #   - theta_dep is a score on the Depression domain
  #   - theta_fat is a score on the Fatigue domain
  #   - theta_pain is a score on the Pain Interference domain
  #   - theta_phys is a score on the Physical Functioning domain
  #   - theta_slp is a score on the Sleep Disturbance domain
  #   - theta_sr is a score on the Ability to Participate in Social Roles 
  # and Activities domain
  
  # The thetas must be of the z-score form: usually a number from -3 to 3.
  # These are the scores constructed with a population mean of 0 and standard
  # deviation of 1. Note in particular, they should not be the "t-score" form.
  # These scores are transformations of the z-scores such that the population
  # mean is 50 and a standard deviation of 10.
  
  # OUTPUTS
  # A list with the following components:
  
  # PROPr -- A number (utility) on the dead = 0, full health = 1 scale. (Note
  # that 1 is the maximum possible value, but scores less than 0 are possible.)
  # One single-attribute utility score for each domain, where the utility of
  # the (disutility) corner state = 0, and full health = 1. (Note scores are
  # bounded by 0 and 1 for the single-attribute scales.) These components are
  # labeled by the domain names.
  
  # Label input components
  theta_cog <- thetas[1]
  theta_dep <- thetas[2]
  theta_fat <- thetas[3]
  theta_pain <- thetas[4]
  theta_phys <- thetas[5]
  theta_slp <- thetas[6]
  theta_sr <- thetas[7]
  
  # Values where the line segments of the isotonic regression with interpolation
  # change
  
  turncog1 <- -2.052
  turncog2 <- -1.565
  turncog3 <- -1.239
  turncog4 <- -0.902
  turncog5 <- -0.649
  turncog6 <- -0.367
  turncog7 <- -0.002
  turncog8 <- 0.52
  turncog9 <- 1.124
  
  turndep1 <- -1.082
  turndep2 <- -0.264
  turndep3 <- 0.151
  turndep4 <- 0.596
  turndep5 <- 0.913
  turndep6 <- 1.388
  turndep7 <- 1.742
  turndep8 <- 2.245
  turndep9 <- 2.703
  
  turnfat1 <- -1.648
  turnfat2 <- -0.818
  turnfat3 <- -0.094
  turnfat4 <- 0.303
  turnfat5 <- 0.87
  turnfat6 <- 1.124
  turnfat7 <- 1.688
  turnfat8 <- 2.053
  turnfat9 <- 2.423
  
  turnpain1 <- -0.773
  turnpain2 <- 0.1
  turnpain3 <- 0.462
  turnpain4 <- 0.827
  turnpain5 <- 1.072
  turnpain6 <- 1.407
  turnpain7 <- 1.724
  turnpain8 <- 2.169
  turnpain9 <- 2.725
  
  turnphys1 <- -2.575
  turnphys2 <- -2.174
  turnphys3 <- -1.784
  turnphys4 <- -1.377
  turnphys5 <- -0.787
  turnphys6 <- -0.443
  turnphys7 <- -0.211
  turnphys8 <- 0.16
  turnphys9 <- 0.966
  
  turnsleep1 <- -1.535
  turnsleep2 <- -0.775
  turnsleep3 <- -0.459
  turnsleep4 <- 0.093
  turnsleep5 <- 0.335
  turnsleep6 <- 0.82
  turnsleep7 <- 1.659
  turnsleep8 <- 1.934
  
  turnsocial1 <- -2.088
  turnsocial2 <- -1.634
  turnsocial3 <- -1.293
  turnsocial4 <- -0.955
  turnsocial5 <- -0.618
  turnsocial6 <- -0.276
  turnsocial7 <- 0.083
  turnsocial8 <- 0.494
  turnsocial9 <- 1.221
  
  # Slopes of each line segment specification
  
  slopecog1 <- -1.0047
  slopecog2 <- -0.1745
  slopecog3 <- -0.4223
  slopecog4 <- -0.1949
  slopecog5 <- -0.1082
  slopecog6 <- -0.2468
  slopecog7 <- -0.0176
  slopecog8 <- -0.2192
  
  
  slopedep1 <- 0.1572
  slopedep2 <- 0
  slopedep3 <- 0.1793
  slopedep4 <- 0.1817
  slopedep5 <- 0.4109
  slopedep6 <- 0.1887
  slopedep7 <- 0.2115
  slopedep8 <- 0.7983
  
  
  slopefat1 <- 0.1152
  slopefat2 <- 0.1077
  slopefat3 <- 0.1189
  slopefat4 <- 0.1277
  slopefat5 <- 0.222
  slopefat6 <- 0.0496
  slopefat7 <- 0.3233
  slopefat8 <- 1.3632
  
  
  slopepain1 <- 0.0891
  slopepain2 <- 0.1721
  slopepain3 <- 0.1022
  slopepain4 <- 0.4241
  slopepain5 <- 0.3815
  slopepain6 <- 0.3681
  slopepain7 <- 0.1169
  slopepain8 <- 0.7594
  
  
  slopephys1 <- -1.0761
  slopephys2 <- -0.1756
  slopephys3 <- -0.1764
  slopephys4 <- -0.1161
  slopephys5 <- -0.2721
  slopephys6 <- -0.4082
  slopephys7 <- -0.1695
  slopephys8 <- -0.1346
  
  
  slopesleep1 <- 0.1241
  slopesleep2 <- 0
  slopesleep3 <- 0.0797
  slopesleep4 <- 0.3455
  slopesleep5 <- 0.3148
  slopesleep6 <- 0.1238
  slopesleep7 <- 1.8964
  
  
  slopesocial1 <- -1.1152
  slopesocial2 <- -0.2874
  slopesocial3 <- -0.1352
  slopesocial4 <- -0.132
  slopesocial5 <- -0.4012
  slopesocial6 <- 0
  slopesocial7 <- -0.054
  slopesocial8 <- -0.201
  
  
  # Intercepts of each line segment specification
  
  interceptcog1 <- -1.0617
  interceptcog2 <- 0.2375
  interceptcog3 <- -0.0694
  interceptcog4 <- 0.1357
  interceptcog5 <- 0.192
  interceptcog6 <- 0.1411
  interceptcog7 <- 0.1416
  interceptcog8 <- 0.2464
  
  
  interceptdep1 <- 0.1701
  interceptdep2 <- 0.1286
  interceptdep3 <- 0.1015
  interceptdep4 <- 0.1001
  interceptdep5 <- -0.1092
  interceptdep6 <- 0.1993
  interceptdep7 <- 0.1595
  interceptdep8 <- -1.1577
  
  
  interceptfat1 <- 0.1898
  interceptfat2 <- 0.1837
  interceptfat3 <- 0.1848
  interceptfat4 <- 0.1821
  interceptfat5 <- 0.1
  interceptfat6 <- 0.2938
  interceptfat7 <- -0.1681
  interceptfat8 <- -2.3031
  
  
  interceptpain1 <- 0.0689
  interceptpain2 <- 0.0606
  interceptpain3 <- 0.0929
  interceptpain4 <- -0.1733
  interceptpain5 <- -0.1277
  interceptpain6 <- -0.1089
  interceptpain7 <- 0.3243
  interceptpain8 <- -1.0692
  
  
  interceptphys1 <- -1.7709
  interceptphys2 <- 0.1867
  interceptphys3 <- 0.1853
  interceptphys4 <- 0.2683
  interceptphys5 <- 0.1456
  interceptphys6 <- 0.0853
  interceptphys7 <- 0.1356
  interceptphys8 <- 0.13
  
  
  interceptsleep1 <- 0.1905
  interceptsleep2 <- 0.0943
  interceptsleep3 <- 0.1309
  interceptsleep4 <- 0.1062
  interceptsleep5 <- 0.1164
  interceptsleep6 <- 0.2731
  interceptsleep7 <- -2.6676
  
  
  interceptsocial1 <- -1.3285
  interceptsocial2 <- 0.0241
  interceptsocial3 <- 0.2209
  interceptsocial4 <- 0.2239
  interceptsocial5 <- 0.0576
  interceptsocial6 <- 0.1683
  interceptsocial7 <- 0.1728
  interceptsocial8 <- 0.2454
  
  
  # Corner state disutility values
  c_cognition <-  0.6350450
  c_depression <-  0.6661641
  c_fatigue <-  0.6386135
  c_pain <-  0.6529680
  c_physical <-  0.6883584
  c_sleep <-  0.5629657
  c_social <-  0.6112686
  C <-  -0.9991828
  
  # Constant for transforming from all-worst = 0 to dead = 0
  to_dead <-  1.021915
  
  # Create output of each single-attribute disutility function
  
  # Cognition Disutility.  Higher cognition scores are better.
  cog_disutility <- 1
  if(turncog1<=theta_cog & theta_cog <turncog2) {cog_disutility <- 
    interceptcog1 + theta_cog * slopecog1}
  if(turncog2<=theta_cog & theta_cog <turncog3) {cog_disutility <- 
    interceptcog2 + theta_cog * slopecog2}
  if(turncog3<=theta_cog & theta_cog <turncog4) {cog_disutility <- 
    interceptcog3 + theta_cog * slopecog3}
  if(turncog4<=theta_cog & theta_cog <turncog5) {cog_disutility <- 
    interceptcog4 + theta_cog * slopecog4}
  if(turncog5<=theta_cog & theta_cog <turncog6) {cog_disutility <- 
    interceptcog5 + theta_cog * slopecog5}
  if(turncog6<=theta_cog & theta_cog <turncog7) {cog_disutility <- 
    interceptcog6 + theta_cog * slopecog6}
  if(turncog7<=theta_cog & theta_cog <turncog8) {cog_disutility <- 
    interceptcog7 + theta_cog * slopecog7}
  if(turncog8<=theta_cog & theta_cog <turncog9) {cog_disutility <- 
    interceptcog8 + theta_cog * slopecog8}
  if(turncog9<=theta_cog) {cog_disutility <- 0}
  
  # Depression Disutility.  Lower depression scores are better
  dep_disutility <- 0
  if(turndep1<=theta_dep & theta_dep <turndep2) {dep_disutility <- 
    interceptdep1 + theta_dep * slopedep1}
  if(turndep2<=theta_dep & theta_dep <turndep3) {dep_disutility <- 
    interceptdep2 + theta_dep * slopedep2}
  if(turndep3<=theta_dep & theta_dep <turndep4) {dep_disutility <- 
    interceptdep3 + theta_dep * slopedep3}
  if(turndep4<=theta_dep & theta_dep <turndep5) {dep_disutility <- 
    interceptdep4 + theta_dep * slopedep4}
  if(turndep5<=theta_dep & theta_dep <turndep6) {dep_disutility <- 
    interceptdep5 + theta_dep * slopedep5}
  if(turndep6<=theta_dep & theta_dep <turndep7) {dep_disutility <- 
    interceptdep6 + theta_dep * slopedep6}
  if(turndep7<=theta_dep & theta_dep <turndep8) {dep_disutility <- 
    interceptdep7 + theta_dep * slopedep7}
  if(turndep8<=theta_dep & theta_dep <turndep9) {dep_disutility <- 
    interceptdep8 + theta_dep * slopedep8}
  if(turndep9<=theta_dep) {dep_disutility <- 1}
  
  
  # Fatigue Disutility.  Lower fatigue scores are better
  fat_disutility <- 0
  if(turnfat1<=theta_fat & theta_fat <turnfat2) {fat_disutility <- 
    interceptfat1 + theta_fat * slopefat1}
  if(turnfat2<=theta_fat & theta_fat <turnfat3) {fat_disutility <- 
    interceptfat2 + theta_fat * slopefat2}
  if(turnfat3<=theta_fat & theta_fat <turnfat4) {fat_disutility <- 
    interceptfat3 + theta_fat * slopefat3}
  if(turnfat4<=theta_fat & theta_fat <turnfat5) {fat_disutility <- 
    interceptfat4 + theta_fat * slopefat4}
  if(turnfat5<=theta_fat & theta_fat <turnfat6) {fat_disutility <- 
    interceptfat5 + theta_fat * slopefat5}
  if(turnfat6<=theta_fat & theta_fat <turnfat7) {fat_disutility <- 
    interceptfat6 + theta_fat * slopefat6}
  if(turnfat7<=theta_fat & theta_fat <turnfat8) {fat_disutility <- 
    interceptfat7 + theta_fat * slopefat7}
  if(turnfat8<=theta_fat & theta_fat <turnfat9) {fat_disutility <- 
    interceptfat8 + theta_fat * slopefat8}
  if(turnfat9<=theta_fat) {fat_disutility <- 1}
  
  
  #  Pain Disutility.  Lower pain scores are better
  pain_disutility <- 0
  if(turnpain1<=theta_pain & theta_pain <turnpain2) {pain_disutility <- 
    interceptpain1 + theta_pain * slopepain1}
  if(turnpain2<=theta_pain & theta_pain <turnpain3) {pain_disutility <- 
    interceptpain2 + theta_pain * slopepain2}
  if(turnpain3<=theta_pain & theta_pain <turnpain4) {pain_disutility <- 
    interceptpain3 + theta_pain * slopepain3}
  if(turnpain4<=theta_pain & theta_pain <turnpain5) {pain_disutility <- 
    interceptpain4 + theta_pain * slopepain4}
  if(turnpain5<=theta_pain & theta_pain <turnpain6) {pain_disutility <- 
    interceptpain5 + theta_pain * slopepain5}
  if(turnpain6<=theta_pain & theta_pain <turnpain7) {pain_disutility <- 
    interceptpain6 + theta_pain * slopepain6}
  if(turnpain7<=theta_pain & theta_pain <turnpain8) {pain_disutility <- 
    interceptpain7 + theta_pain * slopepain7}
  if(turnpain8<=theta_pain & theta_pain <turnpain9) {pain_disutility <- 
    interceptpain8 + theta_pain * slopepain8}
  if(turnpain9<=theta_pain) {pain_disutility <- 1}
  
  
  #  Physical Disutility.  Higher physical function scores are better
  physical_disutility <- 1
  if(turnphys1<=theta_phys & theta_phys <turnphys2) {physical_disutility <- 
    interceptphys1 + theta_phys * slopephys1}
  if(turnphys2<=theta_phys & theta_phys <turnphys3) {physical_disutility <- 
    interceptphys2 + theta_phys * slopephys2}
  if(turnphys3<=theta_phys & theta_phys <turnphys4) {physical_disutility <- 
    interceptphys3 + theta_phys * slopephys3}
  if(turnphys4<=theta_phys & theta_phys <turnphys5) {physical_disutility <- 
    interceptphys4 + theta_phys * slopephys4}
  if(turnphys5<=theta_phys & theta_phys <turnphys6) {physical_disutility <- 
    interceptphys5 + theta_phys * slopephys5}
  if(turnphys6<=theta_phys & theta_phys <turnphys7) {physical_disutility <- 
    interceptphys6 + theta_phys * slopephys6}
  if(turnphys7<=theta_phys & theta_phys <turnphys8) {physical_disutility <- 
    interceptphys7 + theta_phys * slopephys7}
  if(turnphys8<=theta_phys & theta_phys <turnphys9) {physical_disutility <- 
    interceptphys8 + theta_phys * slopephys8}
  if(turnphys9<=theta_phys) {physical_disutility <- 0}
  
  
  #  Sleep Disutility.  Lower sleep disturbance scores are better
  sleep_disutility <- 0
  if(turnsleep1<=theta_slp & theta_slp <turnsleep2) {sleep_disutility <- 
    interceptsleep1 + theta_slp * slopesleep1}
  if(turnsleep2<=theta_slp & theta_slp <turnsleep3) {sleep_disutility <- 
    interceptsleep2 + theta_slp * slopesleep2}
  if(turnsleep3<=theta_slp & theta_slp <turnsleep4) {sleep_disutility <- 
    interceptsleep3 + theta_slp * slopesleep3}
  if(turnsleep4<=theta_slp & theta_slp <turnsleep5) {sleep_disutility <- 
    interceptsleep4 + theta_slp * slopesleep4}
  if(turnsleep5<=theta_slp & theta_slp <turnsleep6) {sleep_disutility <- 
    interceptsleep5 + theta_slp * slopesleep5}
  if(turnsleep6<=theta_slp & theta_slp <turnsleep7) {sleep_disutility <- 
    interceptsleep6 + theta_slp * slopesleep6}
  if(turnsleep7<=theta_slp & theta_slp <turnsleep8) {sleep_disutility <- 
    interceptsleep7 + theta_slp * slopesleep7}
  if(turnsleep8<=theta_slp) {sleep_disutility <- 1}
  
  #  Social Disutility.  Higher social scores are better
  social_disutility <- 1
  if(turnsocial1<=theta_sr & theta_sr <turnsocial2) {social_disutility <- 
    interceptsocial1 + theta_sr * slopesocial1}
  if(turnsocial2<=theta_sr & theta_sr <turnsocial3) {social_disutility <- 
    interceptsocial2 + theta_sr * slopesocial2}
  if(turnsocial3<=theta_sr & theta_sr <turnsocial4) {social_disutility <- 
    interceptsocial3 + theta_sr * slopesocial3}
  if(turnsocial4<=theta_sr & theta_sr <turnsocial5) {social_disutility <- 
    interceptsocial4 + theta_sr * slopesocial4}
  if(turnsocial5<=theta_sr & theta_sr <turnsocial6) {social_disutility <- 
    interceptsocial5 + theta_sr * slopesocial5}
  if(turnsocial6<=theta_sr & theta_sr <turnsocial7) {social_disutility <- 
    interceptsocial6 + theta_sr * slopesocial6}
  if(turnsocial7<=theta_sr & theta_sr <turnsocial8) {social_disutility <- 
    interceptsocial7 + theta_sr * slopesocial7}
  if(turnsocial8<=theta_sr & theta_sr <turnsocial9) {social_disutility <- 
    interceptsocial8 + theta_sr * slopesocial8}
  if(turnsocial9<=theta_sr) {social_disutility <- 0}
  
  
  # Now, plug it into the multiattribute disutility function
  
  multi_attribute_disutility <- 
    (1/C) * ((1 + C * c_cognition * cog_disutility)*
               (1 + C * c_depression * dep_disutility)*
               (1 + C * c_fatigue * fat_disutility)*
               (1 + C * c_pain * pain_disutility)*
               (1 + C * c_physical * physical_disutility)*
               (1 + C * c_sleep * sleep_disutility)*
               (1 + C * c_social * social_disutility) - 1)
  
  
  # Now make it a utility, on the dead/full health scale 
  PROPr <- round(1 - to_dead * multi_attribute_disutility, 3)
  
  # Single attribute utility functions 
  cognition_utility <- round(1 - cog_disutility, 3)
  depression_utility <- round(1 - dep_disutility, 3)
  fatigue_utility <- round(1 - fat_disutility, 3)
  pain_utility <- round(1 - pain_disutility, 3)
  physical_utility <- round(1 - physical_disutility, 3)
  sleep_utility <- round(1 - sleep_disutility, 3)
  social_utility <- round(1 - social_disutility, 3)
  
  
  # Return PROPr multi-attribute score on dead-full health scale, and individual 
  # scores on each of the single-attribute functions, where 0 = disutility 
  # corner state and 1 = full health.
  propr.values <- list(PROPr = PROPr,
                       cognition = cognition_utility,
                       depression = depression_utility,
                       fatigue = fatigue_utility,
                       pain = pain_utility,
                       physical = physical_utility,
                       sleep = sleep_utility,
                       social = social_utility)
  
  return(propr.values)
}


# Load data ------------

# Caregivers
Caregivers_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_caregivers),
  sheet = dict$sheet
) %>%
  select(-ahv) %>%
  mutate(
    # year_of_birth = parse_number(
    #   year_of_birth,
    #   na = c("null", "NA", "", "Unknown")
    # ),
    # has_second_job = ifelse(
    #   has_second_job == "WAHR",
    #   "Yes",
    #   ifelse(has_second_job == "FALSCH", "No", has_second_job)
    # ),
    # is_tariff_b_qualified = ifelse(is_tariff_b_qualified, "Yes", "No"),
    across(where(is.character), ~ na_if(., dict$NA_char)),
    gender = na_if(gender, "Unknown")
  )
Caregivers_rename <- rename(Caregivers_raw, all_of(dict$caregivers_lookup))

# Recipients
Recipients_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_recipients),
  sheet = dict$sheet
) %>%
  select(-ahv) %>%
  mutate(
    #is_other_spitex_involved = ifelse(is_other_spitex_involved, "Yes", "No")
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
Recipients_rename <- rename(Recipients_raw, all_of(dict$recipients_lookup))

# Relationship
Relationship_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_relationship),
  sheet = dict$sheet
) %>%
  select(-created_by, -updated_by, -created_at, -updated_at) %>%
  mutate(
    # active_from = as.Date(active_from, origin = "1899-12-30"),
    # active_to = as.Date(
    #   parse_number(active_to, na = c("null", "NA", "")),
    #   origin = "1899-12-30"
    # ),
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
Relationship_rename <- rename(
  Relationship_raw,
  all_of(dict$relationship_lookup)
)

# Care_prescription
Care_prescription_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_care_prescription),
  sheet = dict$sheet
) %>%
  mutate(
    # start_date = as.Date(start_date, origin = "1899-12-30"),
    # end_date = as.Date(
    #   parse_number(end_date, na = c("null", "NA", "")),
    #   origin = "1899-12-30"
    # ),
    estimated_recurring_monthly_hours = ceiling(
      estimated_recurring_monthly_minutes / 60
    ),
    # customer_is_other_spitex_involved = case_when(
    #   customer_is_other_spitex_involved == TRUE ~ "Yes",
    #   customer_is_other_spitex_involved == FALSE ~ "No",
    #   .default = NA
    # ),
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
Care_prescription_rename <- rename(
  Care_prescription_raw,
  all_of(dict$care_prescription_lookup)
)

# Tasks
Tasks_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_tasks),
  sheet = dict$sheet
) %>%
  select(
    -task_description,
    -task_difficulties,
    -customer_mood_reason,
    -created_by,
    -created_at,
    -updated_at,
    -updated_by
  ) %>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  )

Tasks_rename <- rename(Tasks_raw, all_of(dict$tasks_lookup))

# interRai codebook
InterRai_codebook <- read_xlsx(
  paste0(dict$data_path, dict$file_inter_rai_codebook),
  sheet = dict$sheet
) %>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  ) %>%
  select(-created_at, -created_by, -updated_at, -updated_by)
InterRai_codebook <- rename(InterRai_codebook, all_of(c("id_interRai" = "id")))
InterRai_codebook <- bind_rows(
  InterRai_codebook %>%
    filter(type == "Select") %>%
    mutate(parsed = map(options, fromJSON)) %>%
    unnest(parsed),
  InterRai_codebook %>% filter(type != "Select")
) %>%
  arrange(display_order)

# interRai_first
InterRai_first_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_inter_rai_first_app),
  sheet = dict$sheet
) %>%
  select(-created_by, -updated_at, -updated_by, -id) %>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
InterRai_first_rename <- rename(
  InterRai_first_raw,
  all_of(dict$interRai_lookup)
)
InterRai_first_merged <- left_join(
  left_join(
    InterRai_first_rename,
    unique(
      InterRai_codebook %>% select(id_interRai, question, description, type)
    ),
    by = join_by(id_interRai == id_interRai)
  ),
  InterRai_codebook %>% select(id_interRai, label, value),
  by = join_by(id_interRai == id_interRai, answer == value)
) %>%
  mutate(answer = ifelse(type == "Select", label, answer)) %>%
  select(-label)


# interRai
InterRai_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_inter_rai),
  sheet = dict$sheet
) %>%
  select(-created_by, -updated_at, -updated_by, -id) %>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
InterRai_rename <- rename(InterRai_raw, all_of(dict$interRai_lookup))
InterRai_merged <- left_join(
  left_join(
    InterRai_rename,
    unique(
      InterRai_codebook %>% select(id_interRai, question, description, type)
    ),
    by = join_by(id_interRai == id_interRai)
  ),
  InterRai_codebook %>% select(id_interRai, label, value),
  by = join_by(id_interRai == id_interRai, answer == value)
) %>%
  mutate(answer = ifelse(type == "Select", label, answer)) %>%
  select(-label)

# Promis
Promis_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_promis),
  sheet = dict$sheet
) %>%
  select(-created_by, -updated_at, -updated_by, -id) %>%
  mutate(
    majority = ifelse(is_adult_suitable, "Adult", "Children"),
    across(where(is.character), ~ na_if(., dict$NA_char))
  ) %>%
  select(-is_adult_suitable, -is_child_suitable)
Promis_rename <- rename(Promis_raw, all_of(dict$promis_lookup))

# Text extractions
Text_extractions_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_text_extractions),
  sheet = dict$sheet
) %>%
  select(-customer_public_id, -id, -updated_at) %>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
Text_extractions_rename <- rename(
  Text_extractions_raw,
  all_of(dict$file_lookup)
)

# Diagnoses
Diagnoses_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_diagnoses),
  sheet = dict$sheet
) %>%
  select(-customer_public_id, -classification_model, -prompt_version) %>%
  mutate(
    # diagnosis_date = as.Date(
    #   parse_number(diagnosis_parsed_date, na = c("null", "NA", "")),
    #   origin = "1899-12-30"
    # ),
    diagnosis_date = diagnosis_parsed_date,
    across(where(is.character), ~ na_if(., dict$NA_char))
  ) %>%
  select(-diagnosis_parsed_date) %>%
  mutate(
    ICD_level_1 = substr(icd_code, 1, 1),
    ICD_level_2 = substr(icd_code, 1, 3),
  )
Diagnoses_rename <- rename(Diagnoses_raw, all_of(dict$file_lookup))

# Medication
Medications_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_medications),
  sheet = dict$sheet
) %>%
  select(-customer_public_id, -classification_model, -prompt_version) %>%
  mutate(
    prescription_date = as.Date(date, format = "%d.%m.%Y"),
    across(where(is.character), ~ na_if(., dict$NA_char))
  ) %>%
  select(-date) 
Medications_rename <- rename(Medications_raw, all_of(dict$file_lookup))

Appointments_raw <- read_xlsx(
  paste0(dict$data_path, dict$file_appointments),
  sheet = dict$sheet
) %>%
  select(-public_no, -created_at, -created_by, -updated_at, -updated_by, -public_id)%>%
  mutate(
    across(where(is.character), ~ na_if(., dict$NA_char))
  )
Appointments_rename <- rename(Appointments_raw, all_of(dict$appointments_lookup))

# House-keeping
all_objects <- ls(envir = .GlobalEnv)
raw_objects <- all_objects[grepl("_raw", all_objects)]
rm(list = raw_objects, envir = .GlobalEnv)
rm(all_objects, raw_objects)

# data checks ----------------

run_data_checks <- function() {
  data_check = list()

  cat("RESULTS OF DATA CHECKS\n-------------------------------------\n\n")

  # Caregivers
  cat("Duplicated caregivers id check: ")
  Duplicated_caregivers_id = Caregivers_rename[
    duplicated(Caregivers_rename$id_caregiver),
  ] %>% 
    arrange(id_caregiver)
  if (nrow(Duplicated_caregivers_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicated_caregivers_id"]] <- Duplicated_caregivers_id
  }

  # Recipients
  cat("Duplicated recipients id check: ")
  Duplicated_recipients_id = Recipients_rename[
    duplicated(Recipients_rename$id_recipient),
  ] %>% 
    arrange(id_recipient)
  if (nrow(Duplicated_recipients_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicated_recipients_id"]] <- Duplicated_recipients_id
  }
  
  cat("Recipients are of adult age: ")
  Nonadult_recipients_id = Recipients_rename[
    Recipients_rename$year_of_birth >=year(today())-17,
  ] %>% 
    arrange(id_recipient)
  if (nrow(Nonadult_recipients_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Nonadult_recipients_id"]] <- Nonadult_recipients_id
  }

  # Relationship
  cat("Duplicated relationship id check: ")
  Duplicated_relationship_id = Relationship_rename[
    duplicated(Relationship_rename$id_relationship),
  ] %>% 
    arrange(id_relationship)
  if (nrow(Duplicated_relationship_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicated_relationship_id"]] <- Duplicated_relationship_id
  }

  cat("Relationship overlap check: ")
  Relationship_overlap = Relationship_rename %>%
    group_by(id_recipient) %>%
    mutate(
      active_to = if_else(
        is.na(active_to),
        as.Date("9999-12-31"),
        active_to
      )
    ) %>%
    arrange(id_recipient, active_from) %>%
    mutate(next_from = lead(active_from)) %>%
    mutate(overlap = next_from <= active_to) %>%
    summarise(has_overlap = any(overlap, na.rm = TRUE)) %>%
    filter(has_overlap == TRUE)
  if (nrow(Relationship_overlap) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Relationship_overlap"]] <- Relationship_overlap
  }
  
  cat("All recipients in relationship table: ")
  Recipient_IDs_not_in_relationship = Recipients_rename$id_recipient[
    !(Recipients_rename$id_recipient %in% Relationship_rename$id_recipient)
  ]
  if (length(Recipient_IDs_not_in_relationship) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Recipient_IDs_not_in_relationship"
    ]] <- Recipient_IDs_not_in_relationship
  }
  
  cat("All relationship_recip in recipients tablee: ")
  Relationship_IDs_not_in_recipients = Relationship_rename$id_recipient[
    !(Relationship_rename$id_recipient %in% Recipients_rename$id_recipient)
  ]
  if (length(Relationship_IDs_not_in_recipients) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Relationship_IDs_not_in_recipients"
    ]] <- Relationship_IDs_not_in_recipients
  }

  cat("All caregivers in relationship table: ")
  Caregivers_IDs_not_in_relationship = Caregivers_rename$id_caregiver[
    !(Caregivers_rename$id_caregiver %in% Relationship_rename$id_caregiver)
  ]
  if (length(Caregivers_IDs_not_in_relationship) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Caregivers_IDs_not_in_relationship"
    ]] = Caregivers_IDs_not_in_relationship
  }
  
  cat("All relationship_cg in caregiver table: ")
  Relationship_IDs_not_in_caregiver = Relationship_rename$id_caregiver[
    !(Relationship_rename$id_caregiver %in% Caregivers_rename$id_caregiver)
  ]
  if (length(Relationship_IDs_not_in_caregiver) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Relationship_IDs_not_in_caregiver"
    ]] = Relationship_IDs_not_in_caregiver
  }

  # Care_prescription
  Prescriptions_check_data = left_join(
    Care_prescription_rename,
    Recipients_rename,
    by = join_by(id_recipient == id_recipient)
  ) %>%
    mutate(
      city_match = city.x == city.y,
      surcharge_match = intensive_care_surcharge.x ==
        intensive_care_surcharge.y,
      spitex_match = is_other_spitex_involved.x == is_other_spitex_involved.y,
      dis_match = disability_compensation.x == disability_compensation.y
    ) %>%
    rename(
      c(
        "city_presc" = "city.x",
        "city_recip" = "city.y",
        "surcharge_presc" = "intensive_care_surcharge.x",
        "surcharge_recip" = "intensive_care_surcharge.y",
        "spitex_presc" = "is_other_spitex_involved.x",
        "spitex_recip" = "is_other_spitex_involved.y",
        "disability_presc" = "disability_compensation.x",
        "disability_recip" = "disability_compensation.y",
      )
    )

  Prescriptions_match_city <- Prescriptions_check_data %>%
    filter(!city_match) %>%
    select(id_recipient, city_presc, city_recip) %>%
    unique()

  cat("Recipient city matches value in prescriptions table: ")
  if (nrow(Prescriptions_match_city) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Prescriptions_match_city"]] = Prescriptions_match_city
  }

  Prescriptions_match_surcharge <- Prescriptions_check_data %>%
    filter(!surcharge_match) %>%
    select(id_recipient, surcharge_presc, surcharge_recip) %>%
    unique()

  cat("Recipient surcharge matches value in prescriptions table: ")
  if (nrow(Prescriptions_match_surcharge) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Prescriptions_match_surcharge"
    ]] = Prescriptions_match_surcharge
  }

  Prescriptions_match_spitex <- Prescriptions_check_data %>%
    filter(!spitex_match) %>%
    select(id_recipient, spitex_presc, spitex_recip) %>%
    unique()

  cat("Recipient Spitex matches value in prescriptions table: ")
  if (nrow(Prescriptions_match_spitex) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Prescriptions_match_spitex"]] = Prescriptions_match_spitex
  }

  Prescriptions_match_disability <- Prescriptions_check_data %>%
    filter(!dis_match) %>%
    select(id_recipient, disability_presc, disability_recip) %>%
    unique()

  cat(
    "Recipient disability compensation matches value in prescriptions table: "
  )
  if (nrow(Prescriptions_match_disability) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Prescriptions_match_disability"
    ]] = Prescriptions_match_disability
  }

  cat("Duplicated care_prescriptions id check: ")
  Duplicated_care_prescription_id = Care_prescription_rename[
    duplicated(Care_prescription_rename$id_care_prescription),
  ]
  if (nrow(Duplicated_care_prescription_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Duplicated_care_prescription_id"
    ]] <- Duplicated_care_prescription_id
  }

  cat("Every recipient has a care prescription: ")
  Recipients_without_care_prescription = Recipients_rename$id_recipient[
    !(Recipients_rename$id_recipient %in% Care_prescription_rename$id_recipient)
  ]
  if (length(Recipients_without_care_prescription) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Recipients_without_care_prescription"
    ]] <- Recipients_without_care_prescription
  }

  # Tasks
  cat("Duplicated tast id check: ")
  Duplicated_task_id = Tasks_rename[duplicated(Tasks_rename$id_task), ]
  if (nrow(Duplicated_task_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicated_task_id"]] <- Duplicated_task_id
  }

  # InterRaifirst

  Multiple_interRai_first = InterRai_first_rename %>%
    group_by(id_recipient) %>%
    filter(length(unique(id_appointment)) > 1) %>%
    select(id_recipient, id_appointment, id_interRai, created_at) %>%
    unique() %>%
    group_by(id_recipient, id_appointment, created_at) %>%
    summarize(n_interRai_questions = n(), .groups = "drop") %>%
    arrange(id_recipient, created_at)

  cat("All interRai_first entries are unique: ")
  if (nrow(Multiple_interRai_first) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Multiple_interRai_first"]] <- Multiple_interRai_first
  }

  Incomplete_interRai_first = InterRai_first_rename %>%
    group_by(id_appointment, created_at) %>%
    filter(!(n() %in% c(18, 20, 24)))

  cat("All interRai_first entries are complete: ")
  if (nrow(Incomplete_interRai_first) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Incomplete_interRai_first"]] <- Incomplete_interRai_first
  }

  Recipients_without_interRai_first = Recipients_rename$id_recipient[
    !(Recipients_rename$id_recipient %in% InterRai_first_rename$id_recipient)
  ]

  cat("Every Recipient has an interRai_first entry: ")
  if (length(Recipients_without_interRai_first) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Recipients_without_interRai_first"
    ]] <- Recipients_without_interRai_first
  }

  # InterRai
  Incomplete_interRai_assessments = InterRai_rename %>%
    group_by(id_appointment) %>%
    summarize(
      n_interRai_questions = length(unique(id_interRai)),
      .groups = "drop"
    ) %>%
    filter(n_interRai_questions < 214)

  cat("All InterRai assessment are complete: ")
  if (nrow(Incomplete_interRai_assessments) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Incomplete_interRai_assessments"
    ]] <- Incomplete_interRai_assessments
  }

  Duplicate_interRai_questions = InterRai_rename %>%
    group_by(id_appointment, id_interRai) %>%
    filter(n() > 1) %>%
    ungroup()

  cat("No Duplicate questions in InterRai assessment: ")
  if (nrow(Duplicate_interRai_questions) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicate_interRai_questions"]] <- Duplicate_interRai_questions
  }

  # Text extraction

  Text_extraction_file_is_not_in_diagnosis_table <- Text_extractions_rename$id_file[
    !(Text_extractions_rename$id_file %in% Diagnoses_rename$id_file)
  ]
  cat("Every Text extraction file is in diagnosis table: ")
  if (length(Text_extraction_file_is_not_in_diagnosis_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Text_extraction_file_is_not_in_diagnosis_table"
    ]] <- Text_extraction_file_is_not_in_diagnosis_table
  }

  Diagnosis_file_is_not_in_text_extraction_table <- Diagnoses_rename$id_file[
    !(Diagnoses_rename$id_file %in% Text_extractions_rename$id_file)
  ]
  cat("Every Diagnosis file is in text extractions table: ")
  if (length(Diagnosis_file_is_not_in_text_extraction_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Diagnosis_file_is_not_in_text_extraction_table"
    ]] <- Diagnosis_file_is_not_in_text_extraction_table
  }

  # Medication
  Text_extraction_file_is_not_in_medications_table <- Text_extractions_rename$id_file[
    !(Text_extractions_rename$id_file %in% Medications_rename$id_file)
  ]
  cat("Every Text extraction file is in medications table: ")
  if (length(Text_extraction_file_is_not_in_medications_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Text_extraction_file_is_not_in_medications_table"
    ]] <- Text_extraction_file_is_not_in_medications_table
  }

  Medications_file_is_not_in_text_extraction_table <- Medications_rename$id_file[
    !(Medications_rename$id_file %in% Text_extractions_rename$id_file)
  ]
  cat("Every Medication file is in text extractions table: ")
  if (length(Medications_file_is_not_in_text_extraction_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Medications_file_is_not_in_text_extraction_table"
    ]] <- Medications_file_is_not_in_text_extraction_table
  }

  Diagnosis_file_is_not_in_medications_table <- Diagnoses_rename$id_file[
    !(Diagnoses_rename$id_file %in% Medications_rename$id_file)
  ]
  cat("Every Diagnosis file is in medications table: ")
  if (length(Diagnosis_file_is_not_in_medications_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Diagnosis_file_is_not_in_medications_table"
    ]] <- Diagnosis_file_is_not_in_medications_table
  }

  Medications_file_is_not_in_diagnosis_table <- Medications_rename$id_file[
    !(Medications_rename$id_file %in% Diagnoses_rename$id_file)
  ]
  cat("Every Medication file is in diagnosis table: ")
  if (length(Medications_file_is_not_in_diagnosis_table) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[[
      "Medications_file_is_not_in_diagnosis_table"
    ]] <- Medications_file_is_not_in_diagnosis_table
  }
  
  # Appointments
  cat("Duplicated appointment id check: ")
  Duplicated_appointments_id = Appointments_rename[
    duplicated(Appointments_rename$id_appointment),
  ] %>% 
    arrange(id_appointment)
  if (nrow(Duplicated_appointments_id) == 0) {
    cat("passed\n")
  } else {
    cat("failed\n")
    data_check[["Duplicated_appointments_id"]] <- Duplicated_appointments_id
  }
  
  return(data_check)
}

data_check <- run_data_checks()

lapply(names(data_check), function(name) {
  write.csv(data_check[[name]], file = paste0(name, ".csv"), row.names = FALSE)
})


# Descriptive plots -------------

# Caregivers
Caregivers_summary_table <- create_summary(
  Caregivers_rename
)
desc_plot_bar(
  plot_data = "Caregivers",
  plot_variable = "gender",
  variable_label = "Gender"
)
desc_plot_bar(
  plot_data = "Caregivers",
  plot_variable = "has_second_job",
  variable_label = "Has second job"
)
desc_plot_bar(
  plot_data = "Caregivers",
  plot_variable = "is_tariff_b_qualified",
  variable_label = "Is tariff B qualified"
)
desc_plot_bar(
  plot_data = "Caregivers",
  plot_variable = "marital_status",
  variable_label = "Marital status"
)
desc_plot_col(
  plot_data = "Caregivers",
  plot_variable = "year_of_birth",
  variable_label = "Year of birth"
)

# Recipients
Recipients_summary_table <- create_summary(
  Recipients_rename %>% select(-city, -zip)
)
desc_plot_bar(
  plot_data = "Recipients",
  plot_variable = "gender",
  variable_label = "Gender"
)
desc_plot_bar(
  plot_data = "Recipients",
  plot_variable = "disability_compensation",
  variable_label = "Disability compensation"
)
desc_plot_bar(
  plot_data = "Recipients",
  plot_variable = "intensive_care_surcharge",
  variable_label = "IC surcharge"
)
desc_plot_bar(
  plot_data = "Recipients",
  plot_variable = "is_other_spitex_involved",
  variable_label = "Other Spitex involved"
)
desc_plot_col(
  plot_data = "Recipients",
  plot_variable = "year_of_birth",
  variable_label = "Year of birth"
)

# Relationships
Relationship_summary_table <- create_summary(
  Relationship_rename %>% select(role)
)
desc_plot_bar(
  plot_data = "Relationship",
  plot_variable = "role",
  variable_label = "Caregiver relationship"
)

# Care_prescriptions
Care_prescription_summary_table <- create_summary(
  Care_prescription_rename %>% select(status, estimated_recurring_monthly_hours)
)
desc_plot_bar(
  plot_data = "Care_prescription",
  plot_variable = "status",
  variable_label = "Status"
)
desc_plot_col(
  plot_data = "Care_prescription",
  plot_variable = "estimated_recurring_monthly_hours",
  variable_label = "Estimated monthly hours"
)

# Tasks
Tasks_summary_table <- create_summary(
  Tasks_rename
)
desc_plot_col(
  plot_data = "Tasks",
  plot_variable = "task_confidence_scale",
  variable_label = "Confidence",
  ticks = seq(from = 0, to = 10, by = 2)
)
desc_plot_col(
  plot_data = "Tasks",
  plot_variable = "documentation_date",
  variable_label = "Date"
)
desc_plot_bar(
  plot_data = "Tasks",
  plot_variable = "customer_mood",
  variable_label = "Recipient mood"
)

# Combined demographics
Demographics <- bind_rows(
  Caregivers_rename %>%
    select(gender, year_of_birth) %>%
    mutate(type = "Caregiver"),
  Recipients_rename %>%
    select(gender, year_of_birth) %>%
    mutate(type = "Patient"),
)
Demographics_summary_table <- create_summary(
  Demographics,
  group_column = "type"
)
desc_plot_bar(
  plot_data = "Demographics",
  plot_variable = "gender",
  variable_label = "Gender",
  group_column = "type"
)
desc_plot_col(
  plot_data = "Demographics",
  plot_variable = "year_of_birth",
  variable_label = "Year of birth",
  group_column = "type"
)

# House-keeping
all_objects <- ls(envir = .GlobalEnv)
raw_objects <- all_objects[grepl("_summary_table", all_objects)]
rm(list = raw_objects, envir = .GlobalEnv)
rm(all_objects, raw_objects)

monthly_hours = left_join(
  Care_prescription_rename %>% select(id_recipient, estimated_recurring_monthly_hours),
  Recipients_rename %>% select(id_recipient, gender, year_of_birth),
  by = join_by(id_recipient)) %>%
  mutate(gender = factor(gender, levels = c("Female","Male"), ordered = T),
         age_group = factor(case_when(
           (year(today()) - year_of_birth) < 40 ~ "<40",
           (year(today()) - year_of_birth) >= 65 ~">=65",
           .default = "40-64"
          ),
          levels = c("<40","40-64",">=65")))

p1 <- ggplot(monthly_hours, aes(x = "All", y = estimated_recurring_monthly_hours)) +
  geom_boxplot(fill = get_viridis_color(1)) +
  labs(x = NULL, y = "Estimated monthly care [hours]", title = "Overall sample") +
  theme_settings +
  theme(plot.title = element_text(hjust = 0.5))

# Plot 2: By gender (remove y-axis)
p2 <- ggplot(monthly_hours, aes(x = gender, y = estimated_recurring_monthly_hours, fill = gender)) +
  geom_boxplot() +
  scale_fill_manual(values = get_viridis_color(2)) +
  labs(x = NULL, y = NULL, title = "By gender") +
  theme_settings +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank())

# Plot 3: By age group (remove y-axis)
p3 <- ggplot(monthly_hours, aes(x = age_group, y = estimated_recurring_monthly_hours, fill = age_group)) +
  geom_boxplot() +
  scale_fill_manual(values = get_viridis_color(3)) +
  labs(x = NULL, y = NULL, title = "By age group") +
  theme_settings +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank())

# Combine plots horizontally with same y-axis scale
combined_plot <- p1 + p2 + p3 + plot_layout(ncol = 3)

# Display the combined plot
combined_plot


# Diagnosis plots -------------------

cd_Diagnoses_rename <- read_xlsx(
  paste0(dict$data_path, dict$file_diagnoses),
  sheet = dict$sheet
) %>%
  select(-customer_public_id, -classification_model, -prompt_version) %>%
  mutate(
    # diagnosis_date = as.Date(
    #   parse_number(diagnosis_parsed_date, na = c("null", "NA", "")),
    #   origin = "1899-12-30"
    # )
  ) %>%
  select(-diagnosis_parsed_date) %>%
  mutate(across(
    all_of(c(
      "diagnosis",
      "diagnosis_category",
      "diagnosis_status",
      "diagnosis_source",
      "icd_code",
      "icd_description"
    )),
    ~ na_if(., "null")
  )) %>%
  mutate(
    ICD_level_1 = substr(icd_code, 1, 1),
    ICD_level_2 = substr(icd_code, 1, 3),
  ) %>%
  rename(all_of(dict$file_lookup))

# Load ICD description
cd_ICD_chapters = read_delim(
  "./icd102019syst_chapters.txt",
  delim = ";",
  col_names = FALSE
)
colnames(cd_ICD_chapters) = c("chapter_code", "chapter_label")

cd_ICD_chapters = cd_ICD_chapters %>%
  mutate(chapter_label = paste0(chapter_code, " - ", chapter_label)) %>%
  mutate(
    chapter_label = ifelse(
      str_length(chapter_label) > 60,
      str_replace(chapter_label, "(.{1,55})\\s", "\\1\n"),
      chapter_label
    )
  )

cd_ICD_groups = read_delim("./icd102019syst_groups.txt", ";", col_names = FALSE)

cd_ICD_groups <- cd_ICD_groups %>%
  mutate(
    group_code = paste(X1, substr(X2, 2, 3), sep = "-"),
    letter = substr(X1, 1, 1),
    start = as.integer(substr(X1, 2, 3)),
    end = as.integer(substr(X2, 2, 3)),
    code = map2(
      letter,
      map2(start, end, seq),
      ~ paste0(.x, sprintf("%02d", .y))
    )
  ) %>%
  select(code, X3, X4, group_code) %>%
  unnest(code) %>%
  filter(code %in% cd_Diagnoses_rename$ICD_level_2)

colnames(cd_ICD_groups) = c(
  "ICD_level_2",
  "chapter_code",
  "group_label",
  "group_code"
)


# create a data frame giving the hierarchical structure of your individuals.

cd_hierarchy <- bind_rows(
  data.frame(
    "from" = rep("Origin", nrow(cd_ICD_chapters)),
    "to" = cd_ICD_chapters$chapter_code,
    "level" = 1
  ),
  data.frame(
    "from" = rep("Origin", nrow(cd_ICD_groups)),
    "to" = cd_ICD_groups$group_code,
    "level" = 2
  )
) %>%
  unique()

# create a vertices data.frame. One line per object of our hierarchy, giving features of nodes.
cd_vertices <- data.frame(
  name = unique(c(
    as.character(cd_hierarchy$from),
    as.character(cd_hierarchy$to)
  ))
) %>%
  mutate(
    level = case_when(
      name == "Origin" ~ 0,
      str_length(name) == 2 ~ 1,
      .default = 2
    )
  )

cd_vertices <- left_join(
  left_join(
    cd_vertices,
    cd_ICD_chapters,
    by = join_by(name == chapter_code)
  ),
  cd_ICD_groups %>% select(group_code, group_label) %>% unique(),
  by = join_by(name == group_code)
) %>%
  mutate(
    vert_label = case_when(
      level == 1 ~ chapter_label,
      level == 2 ~ group_label,
      .default = "Origin"
    )
  ) %>%
  select(-group_label, -chapter_label)

cd_vertices = left_join(
  left_join(
    cd_vertices,
    cd_ICD_groups %>% select(-ICD_level_2) %>% unique(),
    by = join_by(name == group_code)
  ),
  cd_ICD_chapters,
  by = join_by(chapter_code == chapter_code)
) %>%
  mutate(
    chapter_code = ifelse(level == 2, chapter_code, name),
    chapter_label = ifelse(level == 2, chapter_label, vert_label)
  ) %>%
  select(-group_label)

n_chapters <- length(unique(cd_vertices$chapter_code)) - 1
range_chapters <- (dict$colour_end - dict$colour_start) / (n_chapters - 1) / 3

cd_vertices = bind_rows(
  cd_vertices %>%
    filter(level == 0) %>%
    mutate(
      vert_colour = mako(n = n(), begin = 0.5, end = 0.5)
    ),
  cd_vertices %>%
    filter(level == 1) %>%
    mutate(
      vert_colour = mako(
        n = n(),
        begin = dict$colour_start,
        end = dict$colour_end
      )
    ),
  cd_vertices %>%
    filter(level == 2) %>%
    mutate(
      mid = dict$colour_start +
        (dict$colour_end - dict$colour_start) *
          ((as.numeric(chapter_code) - 1) / (n_chapters - 1)),
    ) %>%
    group_by(chapter_code) %>%
    mutate(
      vert_colour = mako(
        n = n(),
        begin = min(mid) - range_chapters,
        end = min(mid) + range_chapters
      )
    ) %>%
    select(-mid)
)

cd_Diagnoses_join = left_join(
  cd_Diagnoses_rename,
  cd_ICD_groups,
  by = join_by(ICD_level_2 == ICD_level_2)
)
n_recipients = length(unique(
  cd_Diagnoses_join %>% filter(has_diagnosis) %>% pull(id_recipient)
))
cd_vertices = left_join(
  left_join(
    cd_vertices,
    cd_Diagnoses_join %>%
      group_by(group_code) %>%
      summarize(count = length(unique(id_recipient))) %>%
      na.omit() %>%
      ungroup() %>%
      mutate(
        share = count / sum(count),
        prev = count / n_recipients
      ),
    by = join_by(name == group_code)
  ),
  cd_Diagnoses_join %>%
    group_by(chapter_code) %>%
    summarize(count = length(unique(id_recipient))) %>%
    na.omit() %>%
    ungroup() %>%
    mutate(
      share = count / sum(count),
      prev = count / n_recipients
    ),
  by = join_by(name == chapter_code)
) %>%
  mutate(
    count = case_when(
      level == 2 ~ count.x,
      level == 1 ~ count.y,
      .default = 0
    ),
    prev = case_when(
      level == 2 ~ prev.x,
      level == 1 ~ prev.y,
      .default = 0
    ),
    share = case_when(
      level == 2 ~ share.x,
      level == 1 ~ share.y,
      .default = 0
    )
  ) %>%
  select(-count.x, -count.y, -prev.x, -prev.y, -share.x, -share.y) %>%
  group_by(chapter_code) %>%
  mutate(share_2 = sum((level == 2) * share)) %>%
  ungroup()

# create connection data
cd_connect <- bind_rows(
  data.frame(
    cd_Diagnoses_join %>%
      select(id_recipient, group_code) %>%
      na.omit() %>%
      unique() %>%
      group_by(id_recipient) %>%
      mutate(n_diagnosis = n()) %>%
      filter(n_diagnosis > 1) %>%
      summarise(
        combinations = list(t(combn(group_code, 2))),
        .groups = "drop"
      ) %>%
      unnest_longer(combinations) %>%
      unique() %>%
      pull(combinations),
    "level" = 2
  ) %>%
    mutate(
      "from" = pmin(X1, X2),
      "to" = pmax(X1, X2)
    ) %>%
    group_by(from, to, level) %>%
    summarize(count = n(), .groups = "drop") %>%
    ungroup() %>%
    mutate(
      share = count / sum(count),
      prev = count / n_recipients
    ),
  data.frame(
    cd_Diagnoses_join %>%
      select(id_recipient, chapter_code) %>%
      na.omit() %>%
      unique() %>%
      group_by(id_recipient) %>%
      mutate(n_diagnosis = n()) %>%
      filter(n_diagnosis > 1) %>%
      summarise(
        combinations = list(t(combn(chapter_code, 2))),
        .groups = "drop"
      ) %>%
      unnest_longer(combinations) %>%
      unique() %>%
      pull(combinations),
    "level" = 1
  ) %>%
    mutate(
      "from" = pmin(X1, X2),
      "to" = pmax(X1, X2)
    ) %>%
    group_by(from, to, level) %>%
    summarize(count = n(), .groups = "drop") %>%
    ungroup() %>%
    mutate(
      share = count / sum(count),
      prev = count / n_recipients
    )
)

cd_connect = left_join(
  left_join(
    cd_connect,
    cd_vertices %>%
      select(name, prev),
    by = join_by(from == name)
  ),
  cd_vertices %>% select(name, prev),
  by = join_by(to == name)
) %>%
  group_by(level) %>%
  mutate(factor = log(prev.x / (prev.y * prev))) %>%
  mutate(factor = (factor / median(factor) - 1)) %>%
  mutate(factor = factor * prev.x^0.8) %>%
  ungroup() %>%
  select(-prev.y, -prev) %>%
  rename(c("prev" = "prev.x")) %>%
  arrange(abs(factor))
cd_factor_lm = lm("factor ~ level*share", cd_connect)
cd_connect$factor = cd_factor_lm$residuals

cd_connect <- cd_connect %>%
  group_by(level) %>%
  mutate(
    factor = ifelse(factor < 0, -factor / min(factor), factor / max(factor))
  ) %>%
  ungroup() %>%
  arrange(level, abs(factor))

codiag_plot_1 <- draw_plot(
  set_level = 1,
  dict = dict,
  vertices = cd_vertices,
  connect = cd_connect,
  hierarchy = cd_hierarchy
)

codiag_plot_2 <- draw_plot(
  set_level = 2,
  dict = dict,
  vertices = cd_vertices,
  connect = cd_connect,
  hierarchy = cd_hierarchy
)

# descriptive plot of chapter prevalence

Diagnosis_summary_table <- cd_vertices %>%
  filter(level == 1) %>%
  mutate(
    group_column = "1",
    variable = "ICD10",
    count = 0
  ) %>%
  rename(
    c(
      "value" = "chapter_label",
      "percentage" = "prev"
    )
  ) %>%
  select(group_column, variable, value, percentage, count)

desc_plot_bar(
  plot_data = "Diagnosis",
  plot_variable = "ICD10",
  variable_label = "ICD-10 chapter",
  plot_width = 15,
  plot_height = 17
)

# House-keeping
all_objects <- ls(envir = .GlobalEnv)
cd_objects <- all_objects[grepl("cd_", all_objects)]
rm(list = cd_objects, envir = .GlobalEnv)
rm(all_objects, cd_objects, n_chapters, n_recipients, range_chapters)


# Descriptive table ---------------

Recipients_ids_to_exclude = Recipients_rename %>% filter(year_of_birth >=year(today())-17) %>% pull(id_recipient)
Caregivers_ids_to_exclude = Caregivers_rename %>% filter(year_of_birth >=year(today())-17) %>% pull(id_caregiver)
Caregivers_ids_to_exclude_2 = c(Caregivers_ids_to_exclude, Relationship_rename %>% filter(id_recipient %in% Recipients_ids_to_exclude) %>% pull(id_caregiver))
Recipients_ids_to_exclude_2 = c(Recipients_ids_to_exclude, Relationship_rename %>% filter(id_caregiver %in% Caregivers_ids_to_exclude) %>% pull(id_recipient))
Recipients_rename = Recipients_rename %>% filter(!(id_recipient %in% Recipients_ids_to_exclude_2))
Caregivers_rename = Caregivers_rename %>% filter(!(id_caregiver %in% Caregivers_ids_to_exclude_2))


char_table = data.frame(
  "Characteristic" = c(
    "Characteristics",
    #"Age, mean +/- SD (median)",
    "Age, median (IQR, 5%/95% quantile)",
    "Age group, n (%)",
    "<40 years",
    "40-64 years",
    "≥65 years",
    "Female sex, n (%)",
    "Nationality (Swiss), n (%)",
    "Tariff B qualification, n (%)",
    "Has primary job, n (%)"
  ),
  "Patients" = c(
    paste0("Patients (n= ", nrow(Recipients_rename), ")"),
    # paste0(
    #   round(
    #     year(today()) - mean(Recipients_rename$year_of_birth, na.rm = TRUE),
    #     1
    #   ),
    #   " ± ",
    #   round(sd(Recipients_rename$year_of_birth, na.rm = TRUE), 1),
    #   " (",
    #   year(today()) - median(Recipients_rename$year_of_birth, na.rm = TRUE),
    #   ")"
    # ),
    paste0(
      year(today()) - median(Recipients_rename$year_of_birth, na.rm = TRUE),
      " (",
      IQR(Recipients_rename$year_of_birth, na.rm = TRUE),
      ", ",
      round(year(today()) - quantile(Recipients_rename$year_of_birth, probs = c(0.95), na.rm = TRUE),0),
      "/",
      round(year(today()) - quantile(Recipients_rename$year_of_birth, probs = c(0.05), na.rm = TRUE),0),
      ")"
    ),
    "",
    paste0(
      sum((year(today()) - Recipients_rename$year_of_birth) < 40, na.rm = T),
      " (",
      round(
        mean(
          (year(today()) - Recipients_rename$year_of_birth) < 40,
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(
        (year(today()) - Recipients_rename$year_of_birth) %in% seq(40, 64),
        na.rm = T
      ),
      " (",
      round(
        mean(
          (year(today()) - Recipients_rename$year_of_birth) %in% seq(40, 64),
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum((year(today()) - Recipients_rename$year_of_birth) >= 65, na.rm = T),
      " (",
      round(
        mean(
          (year(today()) - Recipients_rename$year_of_birth) >= 65,
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(Recipients_rename$gender == "Female", na.rm = T),
      " (",
      round(
        sum(Recipients_rename$gender == "Female", na.rm = T) /
          sum(Recipients_rename$gender %in% c("Female", "Male"), na.rm = T) *
          100,
        0
      ),
      "%)"
    ),
    "",
    "",
    ""
  ),
  "Caregivers" = c(
    paste0("Caregivers (n= ", nrow(Caregivers_rename), ")"),
    # paste0(
    #   round(
    #     year(today()) - mean(Caregivers_rename$year_of_birth, na.rm = TRUE),
    #     1
    #   ),
    #   " ± ",
    #   round(sd(Caregivers_rename$year_of_birth, na.rm = TRUE), 1),
    #   " (",
    #   year(today()) - median(Caregivers_rename$year_of_birth, na.rm = TRUE),
    #   ")"
    # ),
    paste0(
      year(today()) - median(Caregivers_rename$year_of_birth, na.rm = TRUE),
      " (",
      IQR(Caregivers_rename$year_of_birth, na.rm = TRUE),
      ", ",
      round(year(today()) - quantile(Caregivers_rename$year_of_birth, probs = c(0.95), na.rm = TRUE),0),
      "/",
      round(year(today()) - quantile(Caregivers_rename$year_of_birth, probs = c(0.05), na.rm = TRUE),0),
      ")"
    ),
    "",
    paste0(
      sum((year(today()) - Caregivers_rename$year_of_birth) < 40, na.rm = T),
      " (",
      round(
        mean(
          (year(today()) - Caregivers_rename$year_of_birth) < 40,
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(
        (year(today()) - Caregivers_rename$year_of_birth) %in% seq(40, 64),
        na.rm = T
      ),
      " (",
      round(
        mean(
          (year(today()) - Caregivers_rename$year_of_birth) %in% seq(40, 64),
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum((year(today()) - Caregivers_rename$year_of_birth) >= 65, na.rm = T),
      " (",
      round(
        mean(
          (year(today()) - Caregivers_rename$year_of_birth) >= 65,
          na.rm = TRUE
        ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(Caregivers_rename$gender == "Female", na.rm = T),
      " (",
      round(
        sum(Caregivers_rename$gender == "Female", na.rm = T) /
          sum(Caregivers_rename$gender %in% c("Female", "Male"), na.rm = T) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(Caregivers_rename$nationality == "CH", na.rm = T),
      " (",
      round(
        sum(Caregivers_rename$nationality == "CH", na.rm = T) /
          sum(Caregivers_rename$nationality != "null", na.rm = T) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(Caregivers_rename$is_tariff_b_qualified, na.rm = T),
      " (",
      round(
        sum(Caregivers_rename$is_tariff_b_qualified, na.rm = T) /
          sum(
            Caregivers_rename$is_tariff_b_qualified %in% c(T,F),
            na.rm = T
          ) *
          100,
        0
      ),
      "%)"
    ),
    paste0(
      sum(Caregivers_rename$has_second_job, na.rm = T),
      " (",
      round(
        sum(Caregivers_rename$has_second_job, na.rm = T) /
          sum(Caregivers_rename$has_second_job %in% c(T,F), na.rm = T) *
          100,
        0
      ),
      "%)"
    )
  )
)


# Calculation of key stats ------------
key_stats <- function(){
  # Recruiting
  cat(
    "Number of recruited patients: ",
    nrow(Recipients_rename),
    "\n"
  )
  cat(
    "Number of recruited caregivers: ",
    nrow(Caregivers_rename),
    "\n"
  )
  rec_start_dat = "2025-07-18"
  rec_end_date = "2025-09-30"
  cat(
    "Number of recruited patients per week: ",
    round(
      nrow(Recipients_rename) /
        as.numeric(difftime(rec_end_date, rec_start_dat, units = "weeks"))
    ),
    "\n"
  )
  
  cat(
    "Number of recruited caregivers per week: ",
    round(
      nrow(Caregivers_rename) /
        as.numeric(difftime(rec_end_date, rec_start_dat, units = "weeks"))
    ),
    "\n"
  )
  
  as.numeric(difftime(rec_end_date, rec_start_dat, units = "weeks"))
  
  
  # Age
  cat(
    "Mean patient age: ",
    round(year(today()) - mean(Recipients_rename$year_of_birth, na.rm = TRUE),1),
    "\n"
  )
  cat(
    "Median patient age: ",
    year(today()) - median(Recipients_rename$year_of_birth, na.rm = TRUE),
    "\n"
  )
  cat(
    "sd patient age: ", 
    round(sd(Recipients_rename$year_of_birth, na.rm = TRUE),1),
    "\n"
  )
  cat(
    "IQR patient age: ", 
    round(IQR(Recipients_rename$year_of_birth, na.rm = TRUE),0),
    "\n"
  )
  cat(
    "5%/95% quantile patient age: ", 
    round(year(today()) - quantile(Recipients_rename$year_of_birth, probs = c(0.95, 0.05), na.rm = TRUE),0),
    "\n"
  )
  
  cat(
    "Mean caregiver age: ",
    round(year(today()) - mean(Caregivers_rename$year_of_birth, na.rm = TRUE),1),
    "\n"
  )
  cat(
    "Median caregiver age: ",
    year(today()) - median(Caregivers_rename$year_of_birth, na.rm = TRUE),
    "\n"
  )
  cat(
    "sd caregiver age: ", 
    round(sd(Caregivers_rename$year_of_birth, na.rm = TRUE),1),
    "\n"
    )
  cat(
    "IQR caregiver age: ", 
    round(IQR(Caregivers_rename$year_of_birth, na.rm = TRUE),0),
    "\n"
  )
  cat(
    "5%/95% quantile caregiver age: ", 
    round(year(today()) - quantile(Caregivers_rename$year_of_birth, probs = c(0.95, 0.05), na.rm = TRUE),0),
    "\n"
  )
  
  
  # Monthly hours
  cat(
    "Mean of estimated monthly care needs: ",
    mean(Care_prescription_rename$estimated_recurring_monthly_hours),
    "\n"
  )
  cat(
    "Median of estimated monthly care needs: ",
    median(Care_prescription_rename$estimated_recurring_monthly_hours),
    "\n"
  )
  cat(
    "Sd of estimated monthly care needs: ",
    sd(Care_prescription_rename$estimated_recurring_monthly_hours),
    "\n"
  )
  
  # Diagnostics
  cat(
    "Number of patients with diagnosis: ",
    length(
      Diagnoses_rename %>%
        filter(has_diagnosis) %>%
        pull(id_recipient) %>%
        unique()
    ),
    "\n"
  )
  cat(
    "Share of patients with diagnosis: ",
    round(
      length(
        Diagnoses_rename %>%
          filter(has_diagnosis) %>%
          pull(id_recipient) %>%
          unique()
      ) /
        nrow(Recipients_rename) *
        100,
      1
    ),
    "%",
    "\n"
  )
  diag_per_patient = Diagnoses_rename %>%
    filter(has_diagnosis) %>%
    select(id_recipient, ICD_level_2) %>%
    unique() %>%
    group_by(id_recipient) %>%
    summarize(n_diag = n()) %>%
    pull(n_diag)
  cat(
    "Median number of diagnoses per patient: ",
    median(diag_per_patient),
    "\n"
  )
  cat(
    "Median number of diagnoses per patient: ",
    mean(diag_per_patient),
    "\n"
  )
  cat(
    "IQR of diagnoses per patient: ",
    round(
      sd(diag_per_patient),
      1
    ),
    "\n"
  )
  
  mean(Care_prescription_rename$estimated_recurring_monthly_hours*12/365<2, na.rm = T)
}

key_stats()


# PROMIS analysis

Promis_analysis = Promis_rename %>% 
  filter(majority=="Adult") %>%
  arrange(created_at) %>%
  group_by(id_recipient, id_promis) %>% 
  slice(n()) %>%
  ungroup() %>% 
  select(id_recipient, id_promis, category_id, answer, question) %>%
  group_by(id_recipient) %>%
  filter(n()==29) %>%
  ungroup() %>%
  mutate(category_id = ifelse(id_promis == "Global07","pain_intensity",category_id))

cat_1 = c("socialRoleParticipation")
cat_2 = c("sleepDisturbance","fatigue","pain")
cat_3 = c("physicalFunctioning")
cat_4 = c("anxiety","depression")
Promis_analysis = Promis_analysis %>%
  mutate(
    answer_score = case_when(
      category_id %in% cat_1 & answer=="Immer" ~ 1,
      category_id %in% cat_1 & answer=="Oft" ~ 2,
      category_id %in% cat_1 & answer=="Manchmal" ~ 3,
      category_id %in% cat_1 & answer=="Selten" ~ 4,
      category_id %in% cat_1 & answer=="Nie" ~ 5,
      category_id %in% cat_2 & answer=="Überhaupt nicht" ~ 1,
      category_id %in% cat_2 & answer=="Ein wenig" ~ 2,
      category_id %in% cat_2 & answer=="Mäßig" ~ 3,
      category_id %in% cat_2 & answer=="Ziemlich" ~ 4,
      category_id %in% cat_2 & answer=="Sehr" ~ 5,
      category_id %in% cat_3 & answer=="Kann ich gar nicht" ~ 1,
      category_id %in% cat_3 & answer=="Mit großen Schwierigkeiten" ~ 2,
      category_id %in% cat_3 & answer=="Mit einigen Schwierigkeiten" ~ 3,
      category_id %in% cat_3 & answer=="Mit geringen Schwierigkeiten" ~ 4,
      category_id %in% cat_3 & answer=="Ohne jede Schwierigkeiten" ~ 5,
      category_id %in% cat_4 & answer=="Nie" ~ 1,
      category_id %in% cat_4 & answer=="Selten" ~ 2,
      category_id %in% cat_4 & answer=="Manchmal" ~ 3,
      category_id %in% cat_4 & answer=="Oft" ~ 4,
      category_id %in% cat_4 & answer=="Immer" ~ 5,
      category_id == "pain_intensity" & answer=="Überhaupt nicht" ~ 1,
      category_id == "pain_intensity" & answer=="Ein wenig" ~ 3,
      category_id == "pain_intensity" & answer=="Mäßig" ~ 5,
      category_id == "pain_intensity" & answer=="Ziemlich" ~ 7,
      category_id == "pain_intensity" & answer=="Sehr" ~ 9,
      .default = NA
    )
  ) %>% 
  mutate(answer_score = ifelse(id_promis %in% c("Sleep109","Sleep116"),6-answer_score,answer_score))

Promis_analysis = Promis_analysis %>%
  group_by(id_recipient, category_id) %>%
  summarize(category_score = sum(answer_score), .groups = "drop")

Promis_lookup =
  data.frame(
    "category_id" = rep(c("physicalFunctioning","anxiety","depression","fatigue","sleepDisturbance","socialRoleParticipation","pain"),each = 17),
    "category_score"=rep(4:20,7),
    "t_score" = c(22.5, 26.6, 28.9, 30.5, 31.9, 33.2, 34.4, 35.6, 36.7, 37.9, 39.2, 40.5, 41.9, 43.5, 45.5, 48.3, 57.0,
                  40.3, 48.0, 51.2, 53.7, 55.8, 57.7, 59.5, 61.4, 63.4, 65.3, 67.3, 69.3, 71.2, 73.3, 75.4, 77.9, 81.6,
                  41.0, 49.0, 51.8, 53.9, 55.7, 57.3, 58.9, 60.5, 62.2, 63.9, 65.7, 67.5, 69.4, 71.2, 73.3, 75.7, 79.4,
                  33.7, 39.7, 43.1, 46.0, 48.6, 51.0, 53.1, 55.1, 57.0, 58.8, 60.7, 62.7, 64.6, 66.7, 69.0, 71.6, 75.8,
                  32.0, 37.5, 41.1, 43.8, 46.2, 48.4, 50.5, 52.4, 54.3, 56.1, 57.9, 59.8, 61.7, 63.8, 66.0, 68.8, 73.3,
                  27.5, 31.8, 34.0, 35.7, 37.3, 38.8, 40.5, 42.3, 44.2, 46.2, 48.1, 50.0, 51.9, 53.7, 55.8, 58.3, 64.2,
                  41.6, 49.6, 52.0, 53.9, 55.6, 57.1, 58.5, 59.9, 61.2, 62.5, 63.8, 65.2, 66.6, 68.0, 69.7, 71.6, 75.6)
  )

Promis_analysis = left_join(
  Promis_analysis,
  Promis_lookup,
  by = join_by("category_id","category_score")) %>%
  mutate(t_score = ifelse(category_id == "pain_intensity",category_score,t_score)) %>%
  mutate(z_score = (t_score-50)/10) %>%
  pivot_wider(
    id_cols = "id_recipient",
    names_from = "category_id",
    values_from = "z_score")

Promis_analysis = Promis_analysis %>%
  mutate(cognition = 0.00943 + 
           (-0.037 * depression) + 
           (0.118 * physicalFunctioning) + 
           (-0.223 * sleepDisturbance) + 
           (0.0505 * socialRoleParticipation) + 
           (-0.168 * anxiety) + 
           (-0.00599 * pain))



Promis_analysis = as.data.frame(do.call(rbind, apply(Promis_analysis, 1, function(x) propr.maut.function.201709(as.numeric(c(
  x[names(x)== "cognition"],
  x[names(x)== "depression"],
  x[names(x)== "fatigue"],
  x[names(x)== "pain"],
  x[names(x)== "physicalFunctioning"],
  x[names(x)== "sleepDisturbance"],
  x[names(x)== "socialRoleParticipation"]
)))))) %>% 
  pivot_longer(cols = everything(), names_to = "Category", values_to = "Utility")

Promis_analysis$Utility = as.numeric(Promis_analysis$Utility)

cat_levels <- sort(unique(Promis_analysis$Category))
cat_levels <- c(setdiff(cat_levels, "PROPr"), "PROPr")
Promis_analysis$Category <- factor(Promis_analysis$Category, levels = cat_levels)

p <- ggplot(Promis_analysis) + 
  geom_boxplot(aes(y = Utility, x = Category), color = get_viridis_color(1)) +
  scale_x_discrete(limits = rev) +
  guides(color = "none") +
  labs(y = "Utility / PROPr score") +
  theme_settings +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_text(margin = margin(r = 8))
  ) + 
  coord_flip()

ggsave(
  plot = p,
  filename = paste0(
    "../Plots/Descriptives/PROPr_score.",
    dict$plot_device
  ),
  device = dict$plot_device,
  dpi = dict$plot_dpi,
  width = dict$desc_width,
  height = dict$desc_height,
  units = dict$desc_units
)
